# import library
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(naniar)
## Warning: package 'naniar' was built under R version 4.4.3
library(mice)
## Warning: package 'mice' was built under R version 4.4.3
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(forcats) 
set.seed(5003)
# Read data
house_market = read.csv('Melbourne_housing_FULL.csv')
glimpse(house_market)
## Rows: 34,857
## Columns: 21
## $ Suburb        <chr> "Abbotsford", "Abbotsford", "Abbotsford", "Abbotsford", …
## $ Address       <chr> "68 Studley St", "85 Turner St", "25 Bloomburg St", "18/…
## $ Rooms         <int> 2, 2, 2, 3, 3, 3, 4, 4, 2, 2, 2, 3, 2, 4, 2, 3, 2, 1, 2,…
## $ Type          <chr> "h", "h", "h", "u", "h", "h", "h", "h", "h", "h", "h", "…
## $ Price         <int> NA, 1480000, 1035000, NA, 1465000, 850000, 1600000, NA, …
## $ Method        <chr> "SS", "S", "S", "VB", "SP", "PI", "VB", "SN", "S", "S", …
## $ SellerG       <chr> "Jellis", "Biggin", "Biggin", "Rounds", "Biggin", "Biggi…
## $ Date          <chr> "3/09/2016", "3/12/2016", "4/02/2016", "4/02/2016", "4/0…
## $ Distance      <chr> "2.5", "2.5", "2.5", "2.5", "2.5", "2.5", "2.5", "2.5", …
## $ Postcode      <chr> "3067", "3067", "3067", "3067", "3067", "3067", "3067", …
## $ Bedroom2      <int> 2, 2, 2, 3, 3, 3, 3, 3, 4, 3, 2, 4, 2, 6, 2, NA, NA, 1, …
## $ Bathroom      <int> 1, 1, 1, 2, 2, 2, 1, 2, 1, 2, 1, 2, 2, 2, 1, NA, NA, 1, …
## $ Car           <int> 1, 1, 0, 1, 0, 1, 2, 2, 2, 1, 0, 0, 1, 0, 2, NA, NA, 1, …
## $ Landsize      <int> 126, 202, 156, 0, 134, 94, 120, 400, 201, 202, 181, 245,…
## $ BuildingArea  <dbl> NA, NA, 79, NA, 150, NA, 142, 220, NA, NA, NA, 210, 82, …
## $ YearBuilt     <int> NA, NA, 1900, NA, 1900, NA, 2014, 2006, 1900, 1900, NA, …
## $ CouncilArea   <chr> "Yarra City Council", "Yarra City Council", "Yarra City …
## $ Lattitude     <dbl> -37.8014, -37.7996, -37.8079, -37.8114, -37.8093, -37.79…
## $ Longtitude    <dbl> 144.9958, 144.9984, 144.9934, 145.0116, 144.9944, 144.99…
## $ Regionname    <chr> "Northern Metropolitan", "Northern Metropolitan", "North…
## $ Propertycount <chr> "4019", "4019", "4019", "4019", "4019", "4019", "4019", …
# check unique value
sapply(house_market, n_distinct)
##        Suburb       Address         Rooms          Type         Price 
##           351         34009            12             3          2872 
##        Method       SellerG          Date      Distance      Postcode 
##             9           388            78           216           212 
##      Bedroom2      Bathroom           Car      Landsize  BuildingArea 
##            16            12            16          1685           741 
##     YearBuilt   CouncilArea     Lattitude    Longtitude    Regionname 
##           161            34         13403         14525             9 
## Propertycount 
##           343
# check the null value
colSums(is.na(house_market))
##        Suburb       Address         Rooms          Type         Price 
##             0             0             0             0          7610 
##        Method       SellerG          Date      Distance      Postcode 
##             0             0             0             0             0 
##      Bedroom2      Bathroom           Car      Landsize  BuildingArea 
##          8217          8226          8728         11810         21115 
##     YearBuilt   CouncilArea     Lattitude    Longtitude    Regionname 
##         19306             0          7976          7976             0 
## Propertycount 
##             0
# remove Unnecessary columns
house_market = house_market %>% select(-Method, -SellerG, -Bedroom2, -Postcode, -Regionname, -Propertycount)

impute the Bathroom, Car, YearBuilt, Landsize with median in different suburbs, because median will be impacted less by extrem value and the house is similar in same suburb.

# impute the bedroom, bathroom, car, YearBuilt, Landsize
house_market$Landsize[house_market$Landsize == 0] = NA # set 0 as NA
house_market = house_market %>%
  group_by(Suburb) %>%
  mutate(
    Bathroom = ifelse(is.na(Bathroom), median(Bathroom, na.rm = TRUE), Bathroom),
    Car       = ifelse(is.na(Car), median(Car, na.rm = TRUE), Car),
    Landsize = ifelse(is.na(Landsize), median(Landsize, na.rm = TRUE), Landsize),
    YearBuilt = ifelse(is.na(YearBuilt), median(YearBuilt, na.rm = TRUE), YearBuilt)
  ) %>%
  ungroup()

Using multiple imputation to impute the BuildingArea because this columns has large percentage Na value we need to use other column to predict

# MICE to impute the BuildingArea
impute_data = house_market[, c("Landsize", "BuildingArea")]
pred = make.predictorMatrix(impute_data)
pred[,] = 0
pred["BuildingArea", "Landsize"] = 1
imp = mice(impute_data, m = 5, method = "pmm", predictorMatrix = pred, maxit = 10, seed = 5003)
## 
##  iter imp variable
##   1   1  Landsize  BuildingArea
##   1   2  Landsize  BuildingArea
##   1   3  Landsize  BuildingArea
##   1   4  Landsize  BuildingArea
##   1   5  Landsize  BuildingArea
##   2   1  Landsize  BuildingArea
##   2   2  Landsize  BuildingArea
##   2   3  Landsize  BuildingArea
##   2   4  Landsize  BuildingArea
##   2   5  Landsize  BuildingArea
##   3   1  Landsize  BuildingArea
##   3   2  Landsize  BuildingArea
##   3   3  Landsize  BuildingArea
##   3   4  Landsize  BuildingArea
##   3   5  Landsize  BuildingArea
##   4   1  Landsize  BuildingArea
##   4   2  Landsize  BuildingArea
##   4   3  Landsize  BuildingArea
##   4   4  Landsize  BuildingArea
##   4   5  Landsize  BuildingArea
##   5   1  Landsize  BuildingArea
##   5   2  Landsize  BuildingArea
##   5   3  Landsize  BuildingArea
##   5   4  Landsize  BuildingArea
##   5   5  Landsize  BuildingArea
##   6   1  Landsize  BuildingArea
##   6   2  Landsize  BuildingArea
##   6   3  Landsize  BuildingArea
##   6   4  Landsize  BuildingArea
##   6   5  Landsize  BuildingArea
##   7   1  Landsize  BuildingArea
##   7   2  Landsize  BuildingArea
##   7   3  Landsize  BuildingArea
##   7   4  Landsize  BuildingArea
##   7   5  Landsize  BuildingArea
##   8   1  Landsize  BuildingArea
##   8   2  Landsize  BuildingArea
##   8   3  Landsize  BuildingArea
##   8   4  Landsize  BuildingArea
##   8   5  Landsize  BuildingArea
##   9   1  Landsize  BuildingArea
##   9   2  Landsize  BuildingArea
##   9   3  Landsize  BuildingArea
##   9   4  Landsize  BuildingArea
##   9   5  Landsize  BuildingArea
##   10   1  Landsize  BuildingArea
##   10   2  Landsize  BuildingArea
##   10   3  Landsize  BuildingArea
##   10   4  Landsize  BuildingArea
##   10   5  Landsize  BuildingArea
completed_data = complete(imp, 1)
house_market$BuildingArea = completed_data$BuildingArea

Remove the NA in Price because this column is target value removing can avoid leaking. Remove Na value in Lattitude and Longtitude these value is hard to impute

house_market = house_market %>%
  filter(
    !is.na(Lattitude),
    !is.na(Longtitude),
    !is.na(Price)
  )
# check the null value
colSums(is.na(house_market))
##       Suburb      Address        Rooms         Type        Price         Date 
##            0            0            0            0            0            0 
##     Distance     Bathroom          Car     Landsize BuildingArea    YearBuilt 
##            0            0            0            5            0           17 
##  CouncilArea    Lattitude   Longtitude 
##            0            0            0
# remove the Na value which can not be imputed
house_market = house_market %>%
  filter(
    !is.na(YearBuilt),
    !is.na(Landsize)
  )

Melbourne Housing Price Heatmap Analysis

# Add geospatial analysis libraries
library(sf)
## Linking to GEOS 3.12.2, GDAL 3.9.3, PROJ 9.4.1; sf_use_s2() is TRUE
library(leaflet)
## Warning: package 'leaflet' was built under R version 4.4.3
library(viridis)
## Warning: package 'viridis' was built under R version 4.4.3
## Loading required package: viridisLite
library(scales)
## Warning: package 'scales' was built under R version 4.4.3
## 
## Attaching package: 'scales'
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
# Load shapefile data
shapefile_path <- "shapefile/SAL_2021_AUST_GDA94.shp"
all_suburbs <- st_read(shapefile_path, quiet = TRUE)

# Filter Victoria state suburbs
melbourne_suburbs <- all_suburbs[all_suburbs$STE_NAME21 == "Victoria", ]

print(paste("Victoria SAL contains", nrow(melbourne_suburbs), "suburbs"))
## [1] "Victoria SAL contains 2946 suburbs"
# Prepare individual house data for plotting
house_data <- house_market %>%
  filter(!is.na(Lattitude), !is.na(Longtitude), !is.na(Price)) %>%
  select(Address, Suburb, Price, Lattitude, Longtitude, Rooms, Type, Bathroom, Car, Landsize, BuildingArea, YearBuilt)

print(paste("Total individual houses for plotting:", nrow(house_data)))
## [1] "Total individual houses for plotting: 20974"
print(paste("Price range: $", format(min(house_data$Price), big.mark = ","), "to $", format(max(house_data$Price), big.mark = ",")))
## [1] "Price range: $ 85,000 to $ 11,200,000"
# Debug: Check house_data
print(paste("House data rows:", nrow(house_data)))
## [1] "House data rows: 20974"
print(paste("Price range:", min(house_data$Price, na.rm = TRUE), "to", max(house_data$Price, na.rm = TRUE)))
## [1] "Price range: 85000 to 11200000"
print("First few rows:")
## [1] "First few rows:"
head(house_data, 3)
## # A tibble: 3 × 12
##   Address Suburb  Price Lattitude Longtitude Rooms Type  Bathroom   Car Landsize
##   <chr>   <chr>   <int>     <dbl>      <dbl> <int> <chr>    <dbl> <dbl>    <dbl>
## 1 85 Tur… Abbot… 1.48e6     -37.8       145.     2 h            1     1      202
## 2 25 Blo… Abbot… 1.03e6     -37.8       145.     2 h            1     0      156
## 3 5 Char… Abbot… 1.46e6     -37.8       145.     3 h            2     0      134
## # ℹ 2 more variables: BuildingArea <dbl>, YearBuilt <dbl>
# Create individual house price scatter plot (optimized for display)
# Sample data for better performance in preview
set.seed(5003)
sample_data <- house_data[sample(nrow(house_data), min(5000, nrow(house_data))), ]

p1 <- ggplot(sample_data, aes(x = Longtitude, y = Lattitude, color = Price)) +
  geom_point(alpha = 0.7, size = 1) +
  scale_color_viridis_c(
    name = "Price\n(AUD)",
    labels = dollar_format(scale = 1e-3, suffix = "K"),
    option = "plasma",
    trans = "log10"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, hjust = 0.5),
    legend.position = "right",
    legend.title = element_text(size = 12),
    legend.text = element_text(size = 10)
  ) +
  labs(
    title = "Melbourne Individual House Price Distribution",
    subtitle = paste("Data Source: Melbourne Housing Dataset | Sample of", nrow(sample_data), "houses (from", nrow(house_data), "total)"),
    x = "Longitude",
    y = "Latitude",
    caption = "Note: Each point represents one house, darker colors indicate higher prices"
  )

# Display the plot
p1

# Calculate average housing prices for each suburb for boundary map
suburb_prices <- house_market %>%
  group_by(Suburb) %>%
  summarise(
    avg_price = mean(Price, na.rm = TRUE),
    median_price = median(Price, na.rm = TRUE),
    count = n(),
    .groups = 'drop'
  ) %>%
  arrange(desc(avg_price))

# Merge suburb data with shapefile
melbourne_suburbs$SAL_NAME_clean <- toupper(trimws(melbourne_suburbs$SAL_NAME21))
suburb_prices$Suburb_clean <- toupper(trimws(suburb_prices$Suburb))

merged_data <- melbourne_suburbs %>%
  left_join(suburb_prices, by = c("SAL_NAME_clean" = "Suburb_clean"))

heatmap_data <- merged_data %>%
  filter(!is.na(avg_price))

print(paste("Suburbs with price data for boundary map:", nrow(heatmap_data)))
## [1] "Suburbs with price data for boundary map: 249"
# Create interactive suburb boundary map
heatmap_data$popup_content <- paste0(
  "<strong>", heatmap_data$SAL_NAME21, "</strong><br/>",
  "Average Price: $", format(heatmap_data$avg_price, big.mark = ",", scientific = FALSE), "<br/>",
  "Median Price: $", format(heatmap_data$median_price, big.mark = ",", scientific = FALSE), "<br/>",
  "Number of Houses: ", heatmap_data$count
)

pal <- colorNumeric(
  palette = "plasma",
  domain = heatmap_data$avg_price,
  na.color = "transparent"
)

interactive_map <- leaflet(heatmap_data) %>%
  addTiles() %>%
  addPolygons(
    fillColor = ~pal(avg_price),
    fillOpacity = 0.7,
    color = "white",
    weight = 1,
    popup = ~popup_content,
    highlight = highlightOptions(
      weight = 3,
      color = "red",
      fillOpacity = 0.9,
      bringToFront = TRUE
    )
  ) %>%
  addLegend(
    pal = pal,
    values = ~avg_price,
    title = "Average Price (AUD)",
    position = "bottomright",
    labFormat = labelFormat(prefix = "$", big.mark = ",")
  )
## Warning: sf layer has inconsistent datum (+proj=longlat +ellps=GRS80 +no_defs).
## Need '+proj=longlat +datum=WGS84'
interactive_map
# Most expensive and cheapest individual houses
top_expensive <- house_data %>%
  arrange(desc(Price)) %>%
  head(10) %>%
  select(Address, Suburb, Price, Rooms, Type, Bathroom, Car, Landsize, BuildingArea, YearBuilt)

top_cheap <- house_data %>%
  arrange(Price) %>%
  head(10) %>%
  select(Address, Suburb, Price, Rooms, Type, Bathroom, Car, Landsize, BuildingArea, YearBuilt)

print("Top 10 Most Expensive Houses:")
## [1] "Top 10 Most Expensive Houses:"
print(top_expensive)
## # A tibble: 10 × 10
##    Address        Suburb  Price Rooms Type  Bathroom   Car Landsize BuildingArea
##    <chr>          <chr>   <int> <int> <chr>    <dbl> <dbl>    <dbl>        <dbl>
##  1 6 Cole St      Brigh… 1.12e7     4 h            3     2     1400         296 
##  2 35 Bevis St    Mulgr… 9   e6     3 h            1     1      744         117 
##  3 49 Mangarra Rd Cante… 8   e6     5 h            5     4     2079         464.
##  4 49 Lisson Gr   Hawth… 7.65e6     4 h            2     4     1690         284 
##  5 42 Hampden Rd  Armad… 6.80e6     4 h            2     2      335          98 
##  6 6 Moorakyne Av Malve… 6.60e6     5 h            3     2      396         465 
##  7 15 Barry St    Kew    6.5 e6     6 h            6     3     1334         365 
##  8 136 Page St    Middl… 6.40e6     5 h            2     1      553         308 
##  9 307 Beaconsfi… Middl… 6.37e6     4 h            3     3      476         279 
## 10 13 Wilks Av    Malve… 6.30e6     5 h            4     4      900         435 
## # ℹ 1 more variable: YearBuilt <dbl>
print("Top 10 Cheapest Houses:")
## [1] "Top 10 Cheapest Houses:"
print(top_cheap)
## # A tibble: 10 × 10
##    Address        Suburb  Price Rooms Type  Bathroom   Car Landsize BuildingArea
##    <chr>          <chr>   <int> <int> <chr>    <dbl> <dbl>    <dbl>        <dbl>
##  1 202/51 Gordon… Foots…  85000     1 u            1     0     266.          265
##  2 30 Pyne St     Caulf… 131000     4 h            1     2     499           155
##  3 171 Moreland … Coburg 145000     4 h            1     1     536           164
##  4 8/6 Ridley St  Albion 145000     1 u            1     1      36           106
##  5 17/17 Park St  Hawth… 160000     1 u            1     0     322           170
##  6 10/30 Pickett… Foots… 170000     1 u            1     0      30            26
##  7 4/831 Park St  Bruns… 170000     1 u            1     0    1250           130
##  8 10/117 Anders… Albion 185000     1 u            1     1    2347            43
##  9 8/132 Rupert … West … 185000     1 u            1     1     379           114
## 10 5/25 Ridley St Albion 190000     2 u            1     1     591           169
## # ℹ 1 more variable: YearBuilt <dbl>